{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Spam Classification Model (Sklearn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Wrap a ML model for use as a prediction microservice in seldon-core\n",
    "- Run locally on Docker to test\n",
    "- Deploy on seldon-core running on k8s cluster"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train Locally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np \n",
    "import pandas as pd\n",
    "from sklearn.externals import joblib\n",
    "from pathlib import Path\n",
    "import string\n",
    "from nltk.stem import SnowballStemmer\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pickle\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.metrics import accuracy_score\n",
    "model_path: Path=Path('./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"spam.csv\",encoding='latin-1')\n",
    "data = data.drop([\"Unnamed: 2\", \"Unnamed: 3\", \"Unnamed: 4\"], axis=1)\n",
    "data = data.rename(columns={\"v1\":\"class\", \"v2\":\"text\"})\n",
    "data.head()\n",
    "\n",
    "def pre_process(text):\n",
    "    text = text.translate(str.maketrans('', '', string.punctuation))\n",
    "    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]\n",
    "    words = \"\"\n",
    "    for i in text:\n",
    "            stemmer = SnowballStemmer(\"english\")\n",
    "            words += (stemmer.stem(i))+\" \"\n",
    "    return words\n",
    "\n",
    "features = data['text'].copy()\n",
    "features = features.apply(pre_process)\n",
    "\n",
    "vectorizer = TfidfVectorizer(\"english\")\n",
    "_features = vectorizer.fit_transform(features)\n",
    "with open('Spam-Classifier/model/vectorizer.pkl', 'wb') as vect:\n",
    "    pickle.dump(vectorizer, vect)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
       "    decision_function_shape='ovr', degree=3, gamma=1.0, kernel='sigmoid',\n",
       "    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,\n",
       "    verbose=False)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer = joblib.load(model_path.joinpath('Spam-Classifier/model/vectorizer.pkl'))\n",
    "train_x, test_x, train_y, test_y = train_test_split(_features, data['class'], test_size=0.3, random_state=0)\n",
    "svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)\n",
    "svc.fit(train_x,train_y)\n",
    "# save the model to disk\n",
    "filename = 'Spam-Classifier/model/model.pkl'\n",
    "pickle.dump(svc, open(filename, 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = joblib.load(model_path.joinpath(filename))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9730861244019139"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prediction = clf.predict(test_x)\n",
    "accuracy_score(test_y,prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.0220629, 0.9779371]])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "message = np.array(['click here to win the price'])\n",
    "data = vectorizer.transform(message).todense()\n",
    "probas = clf.predict_proba(data)\n",
    "probas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['ham', 'spam'], dtype=object)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.classes_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### wrap each model component using s2i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "---> Installing application source...\n",
      "---> Installing dependencies ...\n",
      "Looking in links: /whl\n",
      "Collecting scikit-learn==0.21.2 (from -r requirements.txt (line 1))\n",
      "  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.\n",
      "Downloading https://files.pythonhosted.org/packages/85/04/49633f490f726da6e454fddc8e938bbb5bfed2001681118d3814c219b723/scikit_learn-0.21.2-cp36-cp36m-manylinux1_x86_64.whl (6.7MB)\n",
      "Requirement already satisfied: numpy>=1.9.2 in /usr/local/lib/python3.6/site-packages (from -r requirements.txt (line 2)) (1.16.3)\n",
      "Collecting scipy>=0.17.0 (from scikit-learn==0.21.2->-r requirements.txt (line 1))\n",
      "  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.\n",
      "Downloading https://files.pythonhosted.org/packages/29/50/a552a5aff252ae915f522e44642bb49a7b7b31677f9580cfd11bcc869976/scipy-1.3.1-cp36-cp36m-manylinux1_x86_64.whl (25.2MB)\n",
      "Collecting joblib>=0.11 (from scikit-learn==0.21.2->-r requirements.txt (line 1))\n",
      "  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.\n",
      "Downloading https://files.pythonhosted.org/packages/8f/42/155696f85f344c066e17af287359c9786b436b1bf86029bb3411283274f3/joblib-0.14.0-py2.py3-none-any.whl (294kB)\n",
      "Installing collected packages: scipy, joblib, scikit-learn\n",
      "Successfully installed joblib-0.14.0 scikit-learn-0.21.2 scipy-1.3.1\n",
      "Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.\n",
      "You are using pip version 18.1, however version 19.3.1 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\n",
      "Build completed successfully\n"
     ]
    }
   ],
   "source": [
    "!s2i build Spam-Classifier/ seldonio/seldon-core-s2i-python3:0.7 spam-classifier:1.0.0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1b8159f67b7ddbd2de26833411303ebee8e08331097e28754f04688c1fb86d3c\r\n"
     ]
    }
   ],
   "source": [
    "!docker run --name \"spam-classifier\" -d --rm -p 5000:5000 spam-classifier:1.0.0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"data\":{\"ndarray\":[\"0.9779371008528993\",\"spam\"]},\"meta\":{}}\r\n"
     ]
    }
   ],
   "source": [
    "!curl -g http://localhost:5000/predict --data-urlencode 'json={\"data\": {\"names\": [\"message\"], \"ndarray\": [\"click here to win the price\"]}}'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "spam-classifier\r\n"
     ]
    }
   ],
   "source": [
    "!docker rm spam-classifier --force"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "---> Installing application source...\n",
      "---> Installing dependencies ...\n",
      "Looking in links: /whl\n",
      "Collecting goslate (from -r requirements.txt (line 1))\n",
      "  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.\n",
      "Downloading https://files.pythonhosted.org/packages/39/0b/50af938a1c3d4f4c595b6a22d37af11ebe666246b05a1a97573e8c8944e5/goslate-1.5.1.tar.gz\n",
      "Requirement already satisfied: numpy in /usr/local/lib/python3.6/site-packages (from -r requirements.txt (line 2)) (1.16.3)\n",
      "Collecting futures (from goslate->-r requirements.txt (line 1))\n",
      "  Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.\n",
      "Downloading https://files.pythonhosted.org/packages/05/80/f41cca0ea1ff69bce7e7a7d76182b47bb4e1a494380a532af3e8ee70b9ec/futures-3.1.1-py3-none-any.whl\n",
      "Building wheels for collected packages: goslate\n",
      "Running setup.py bdist_wheel for goslate: started\n",
      "Running setup.py bdist_wheel for goslate: finished with status 'done'\n",
      "Stored in directory: /root/.cache/pip/wheels/4f/7f/28/6f52271012a7649b54b1a7adaae329b4246bbbf9d1e4f6e51a\n",
      "Successfully built goslate\n",
      "Installing collected packages: futures, goslate\n",
      "Successfully installed futures-3.1.1 goslate-1.5.1\n",
      "Url '/whl' is ignored. It is either a non-existing path or lacks a specific scheme.\n",
      "You are using pip version 18.1, however version 19.3.1 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\n",
      "Build completed successfully\n"
     ]
    }
   ],
   "source": [
    "!s2i build Translator/ seldonio/seldon-core-s2i-python3:0.7 translator:1.0.0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ca18617eed4ee5b12c1ce835d94a677007e3c095166b8e4e5d0f9fd164757814\r\n"
     ]
    }
   ],
   "source": [
    "!docker run --name \"eng-translator\" -d --rm -p 5000:5000 translator:1.0.0.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"data\":{\"names\":[\"message\"],\"ndarray\":[\"How is your day\"]},\"meta\":{}}\r\n"
     ]
    }
   ],
   "source": [
    "!curl -g http://localhost:5000/transform-input --data-urlencode 'json={\"data\": {\"names\": [\"message\"], \"ndarray\": [\"Wie läuft dein Tag\"]}}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eng-translator\r\n"
     ]
    }
   ],
   "source": [
    "!docker rm eng-translator --force"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Assuming you have kubernetes cluster running and seldon-core installed, you can deploy your Machine Learning model using:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "kubectl apply -f deploy.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
